The data is about Coronavirus (COVID-19) Vaccinations
According to the source website, the vaccination dataset is built from the most recent official figures published by governments and health ministries around the world. Population estimates used for per-capita metrics come from the United Nations World Population Prospects, and income groups follow the World Bank classification. Source: https://ourworldindata.org/covid-vaccinations
# Load required libraries
library(tidyverse)
library(lares) # show the structure of data
library(vtable) # show the mean, sd, min, max
library(caret)
library(psycho) # Standardizing
library('DescTools')
library(ggcorrplot)
library(plotly) # Using plotly for interactive plots
library(fastDummies) # for dummy encoding
# Read the downloaded OWID COVID-19 CSV from the working directory
csv_path <- "owid-covid-data.csv"
data <- read.csv(file = csv_path)
# Report the table size as (rows, columns)
dim(data)
## [1] 351787 67
# Preview the first five rows of the raw table
head(data, n = 5L)
## iso_code continent location date total_cases new_cases
## 1 AFG Asia Afghanistan 2020-01-03 NA 0
## 2 AFG Asia Afghanistan 2020-01-04 NA 0
## 3 AFG Asia Afghanistan 2020-01-05 NA 0
## 4 AFG Asia Afghanistan 2020-01-06 NA 0
## 5 AFG Asia Afghanistan 2020-01-07 NA 0
## new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 1 NA NA 0 NA
## 2 NA NA 0 NA
## 3 NA NA 0 NA
## 4 NA NA 0 NA
## 5 NA NA 0 NA
## total_cases_per_million new_cases_per_million new_cases_smoothed_per_million
## 1 NA 0 NA
## 2 NA 0 NA
## 3 NA 0 NA
## 4 NA 0 NA
## 5 NA 0 NA
## total_deaths_per_million new_deaths_per_million
## 1 NA 0
## 2 NA 0
## 3 NA 0
## 4 NA 0
## 5 NA 0
## new_deaths_smoothed_per_million reproduction_rate icu_patients
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## icu_patients_per_million hosp_patients hosp_patients_per_million
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## weekly_icu_admissions weekly_icu_admissions_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## weekly_hosp_admissions weekly_hosp_admissions_per_million total_tests
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## new_tests_smoothed_per_thousand positive_rate tests_per_case tests_units
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## total_vaccinations people_vaccinated people_fully_vaccinated total_boosters
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## new_vaccinations new_vaccinations_smoothed total_vaccinations_per_hundred
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## total_boosters_per_hundred new_vaccinations_smoothed_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## stringency_index population_density median_age aged_65_older aged_70_older
## 1 0 54.422 18.6 2.581 1.337
## 2 0 54.422 18.6 2.581 1.337
## 3 0 54.422 18.6 2.581 1.337
## 4 0 54.422 18.6 2.581 1.337
## 5 0 54.422 18.6 2.581 1.337
## gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence
## 1 1803.987 NA 597.029 9.59
## 2 1803.987 NA 597.029 9.59
## 3 1803.987 NA 597.029 9.59
## 4 1803.987 NA 597.029 9.59
## 5 1803.987 NA 597.029 9.59
## female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
## 1 NA NA 37.746 0.5
## 2 NA NA 37.746 0.5
## 3 NA NA 37.746 0.5
## 4 NA NA 37.746 0.5
## 5 NA NA 37.746 0.5
## life_expectancy human_development_index population
## 1 64.83 0.511 41128772
## 2 64.83 0.511 41128772
## 3 64.83 0.511 41128772
## 4 64.83 0.511 41128772
## 5 64.83 0.511 41128772
## excess_mortality_cumulative_absolute excess_mortality_cumulative
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## excess_mortality excess_mortality_cumulative_per_million
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
# Compact per-column overview: storage type and leading values
str(object = data)
## 'data.frame': 351787 obs. of 67 variables:
## $ iso_code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ continent : chr "Asia" "Asia" "Asia" "Asia" ...
## $ location : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ date : chr "2020-01-03" "2020-01-04" "2020-01-05" "2020-01-06" ...
## $ total_cases : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases_smoothed : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_deaths : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths_smoothed : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_cases_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_cases_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases_smoothed_per_million : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_deaths_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths_smoothed_per_million : num NA NA NA NA NA 0 0 0 0 0 ...
## $ reproduction_rate : num NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients : num NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients : num NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ positive_rate : num NA NA NA NA NA NA NA NA NA NA ...
## $ tests_per_case : num NA NA NA NA NA NA NA NA NA NA ...
## $ tests_units : chr "" "" "" "" ...
## $ total_vaccinations : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_boosters : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_vaccinations_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_boosters_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_people_vaccinated_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_people_vaccinated_smoothed_per_hundred: num NA NA NA NA NA NA NA NA NA NA ...
## $ stringency_index : num 0 0 0 0 0 0 0 0 0 0 ...
## $ population_density : num 54.4 54.4 54.4 54.4 54.4 ...
## $ median_age : num 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
## $ aged_65_older : num 2.58 2.58 2.58 2.58 2.58 ...
## $ aged_70_older : num 1.34 1.34 1.34 1.34 1.34 ...
## $ gdp_per_capita : num 1804 1804 1804 1804 1804 ...
## $ extreme_poverty : num NA NA NA NA NA NA NA NA NA NA ...
## $ cardiovasc_death_rate : num 597 597 597 597 597 ...
## $ diabetes_prevalence : num 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
## $ female_smokers : num NA NA NA NA NA NA NA NA NA NA ...
## $ male_smokers : num NA NA NA NA NA NA NA NA NA NA ...
## $ handwashing_facilities : num 37.7 37.7 37.7 37.7 37.7 ...
## $ hospital_beds_per_thousand : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ life_expectancy : num 64.8 64.8 64.8 64.8 64.8 ...
## $ human_development_index : num 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
## $ population : num 41128772 41128772 41128772 41128772 41128772 ...
## $ excess_mortality_cumulative_absolute : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality_cumulative : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality_cumulative_per_million : num NA NA NA NA NA NA NA NA NA NA ...
# Logical mask that is TRUE wherever a cell of the data frame is NA
is_null <- data %>% is.na()
# Per-column totals of missing cells
num_null_values <- is_null %>% colSums(na.rm = TRUE)
# Display the totals, one named entry per column
num_null_values %>% print()
## iso_code
## 0
## continent
## 0
## location
## 0
## date
## 0
## total_cases
## 37991
## new_cases
## 9611
## new_cases_smoothed
## 10870
## total_deaths
## 59620
## new_deaths
## 9560
## new_deaths_smoothed
## 10790
## total_cases_per_million
## 37991
## new_cases_per_million
## 9611
## new_cases_smoothed_per_million
## 10870
## total_deaths_per_million
## 59620
## new_deaths_per_million
## 9560
## new_deaths_smoothed_per_million
## 10790
## reproduction_rate
## 166970
## icu_patients
## 314132
## icu_patients_per_million
## 314132
## hosp_patients
## 312824
## hosp_patients_per_million
## 312824
## weekly_icu_admissions
## 341552
## weekly_icu_admissions_per_million
## 341552
## weekly_hosp_admissions
## 328484
## weekly_hosp_admissions_per_million
## 328484
## total_tests
## 272400
## new_tests
## 276384
## total_tests_per_thousand
## 272400
## new_tests_per_thousand
## 276384
## new_tests_smoothed
## 247822
## new_tests_smoothed_per_thousand
## 247822
## positive_rate
## 255860
## tests_per_case
## 257439
## tests_units
## 0
## total_vaccinations
## 272338
## people_vaccinated
## 275742
## people_fully_vaccinated
## 279073
## total_boosters
## 304093
## new_vaccinations
## 286321
## new_vaccinations_smoothed
## 169536
## total_vaccinations_per_hundred
## 272338
## people_vaccinated_per_hundred
## 275742
## people_fully_vaccinated_per_hundred
## 279073
## total_boosters_per_hundred
## 304093
## new_vaccinations_smoothed_per_million
## 169536
## new_people_vaccinated_smoothed
## 169743
## new_people_vaccinated_smoothed_per_hundred
## 169743
## stringency_index
## 154136
## population_density
## 53139
## median_age
## 74062
## aged_65_older
## 83763
## aged_70_older
## 76846
## gdp_per_capita
## 79587
## extreme_poverty
## 176357
## cardiovasc_death_rate
## 78952
## diabetes_prevalence
## 65070
## female_smokers
## 147116
## male_smokers
## 149900
## handwashing_facilities
## 218141
## hospital_beds_per_thousand
## 110924
## life_expectancy
## 28126
## human_development_index
## 87343
## population
## 0
## excess_mortality_cumulative_absolute
## 339582
## excess_mortality_cumulative
## 339582
## excess_mortality
## 339582
## excess_mortality_cumulative_per_million
## 339582
# Show the structure of the dataset as a plot, using df_str() from lares
df_str(data, return = "plot")
As we can see from the above output, some columns contain a very high number of null values. The first step is therefore to drop every column whose number of null values is more than half the total number of rows (observations).
**Note:** R does not always read empty strings and question marks as nulls. We therefore first convert question marks and empty strings to NA, and only then count the null values.
# Treat the placeholder strings "?" and "" as missing values.
# Only character/factor columns can hold these strings, so the replacement is
# restricted to them rather than comparing every cell of the data frame
# (numeric cells included) against a string.
is_text_col <- vapply(
  data,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)
if (any(is_text_col)) {
  data[is_text_col] <- lapply(data[is_text_col], function(col) {
    col[col %in% c("?", "")] <- NA
    col
  })
}
# Count the missing values in each column
null_counts <- colSums(is.na(data))
# Names of the columns where more than half of the rows are missing
columns_to_drop <- names(null_counts[null_counts > nrow(data) / 2])
# Drop those columns; drop = FALSE keeps a data frame even if only a single
# column survives the filter
data <- data[, !names(data) %in% columns_to_drop, drop = FALSE]
dim(data)
## [1] 351787 35
# Keep only the complete rows, i.e. those without an NA in any column
data <- data %>% na.omit()
# Rebuild the missing-value mask on the filtered data
is_null <- data %>% is.na()
# Per-column totals of missing cells
num_null_values <- is_null %>% colSums(na.rm = TRUE)
# Display the totals, one named entry per column
num_null_values %>% print()
## iso_code
## 0
## continent
## 0
## location
## 0
## date
## 0
## total_cases
## 0
## new_cases
## 0
## new_cases_smoothed
## 0
## total_deaths
## 0
## new_deaths
## 0
## new_deaths_smoothed
## 0
## total_cases_per_million
## 0
## new_cases_per_million
## 0
## new_cases_smoothed_per_million
## 0
## total_deaths_per_million
## 0
## new_deaths_per_million
## 0
## new_deaths_smoothed_per_million
## 0
## reproduction_rate
## 0
## new_vaccinations_smoothed
## 0
## new_vaccinations_smoothed_per_million
## 0
## new_people_vaccinated_smoothed
## 0
## new_people_vaccinated_smoothed_per_hundred
## 0
## stringency_index
## 0
## population_density
## 0
## median_age
## 0
## aged_65_older
## 0
## aged_70_older
## 0
## gdp_per_capita
## 0
## cardiovasc_death_rate
## 0
## diabetes_prevalence
## 0
## female_smokers
## 0
## male_smokers
## 0
## hospital_beds_per_thousand
## 0
## life_expectancy
## 0
## human_development_index
## 0
## population
## 0
# Open the cleaned table in the spreadsheet-style viewer. This is guarded by
# interactive() because View() needs a GUI viewer and may fail when the script
# is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(data)
}
We can now see that we do not have any null values in the columns.
# na.omit() returns only the rows that contain no NA
data_clean <- na.omit(data)
# Print the number of rows and columns of the cleaned data
cat(
  'The shape of the data:', nrow(data_clean), 'rows/observations',
  'and', ncol(data_clean), 'columns'
)
## The shape of the data: 80318 rows/observations and 35 columns
The dataset is already eligible for our analysis, as it meets the eligibility requirement of at least 7000 rows and 10 columns.
# Remove the listed columns with subset(); the remaining columns keep their
# original order
drop_cols <- c(
  "population", "human_development_index", "life_expectancy",
  "hospital_beds_per_thousand", "male_smokers", "female_smokers",
  "diabetes_prevalence", "cardiovasc_death_rate", "gdp_per_capita",
  "aged_70_older", "aged_65_older", "median_age", "population_density",
  "stringency_index", "new_people_vaccinated_smoothed_per_hundred",
  "new_vaccinations_smoothed_per_million", "new_deaths_smoothed_per_million",
  "new_deaths_per_million", "total_deaths_per_million",
  "new_cases_smoothed_per_million", "new_cases_per_million",
  "total_cases_per_million"
)
data <- subset(data, select = setdiff(names(data), drop_cols))
dim(data)
## [1] 80318 13
# Mark every cell equal to 0 as missing, then discard each row that holds one
data_no_zeros <- replace(data, data == 0, NA)
data <- na.omit(data_no_zeros)
# Inspect the column types of the filtered data
str(object = data)
## 'data.frame': 43428 obs. of 13 variables:
## $ iso_code : chr "ALB" "ALB" "ALB" "ALB" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ location : chr "Albania" "Albania" "Albania" "Albania" ...
## $ date : chr "2021-01-11" "2021-01-12" "2021-01-13" "2021-01-14" ...
## $ total_cases : num 63033 63595 63971 64627 65334 ...
## $ new_cases : num 655 562 376 656 707 660 641 581 474 292 ...
## $ new_cases_smoothed : num 577 594 621 621 618 ...
## $ total_deaths : num 1233 1241 1247 1252 1256 ...
## $ new_deaths : num 3 8 6 5 4 5 4 5 7 4 ...
## $ new_deaths_smoothed : num 6.14 6.86 6.86 6 5.57 ...
## $ reproduction_rate : num 1.06 1.07 1.07 1.07 1.07 1.07 1.07 1.08 1.09 1.1 ...
## $ new_vaccinations_smoothed : num 64 64 63 66 62 62 58 55 51 47 ...
## $ new_people_vaccinated_smoothed: num 64 64 63 66 62 62 58 55 51 47 ...
## - attr(*, "na.action")= 'omit' Named int [1:36890] 18 19 20 21 22 23 138 139 144 145 ...
## ..- attr(*, "names")= chr [1:36890] "3176" "3177" "3178" "3179" ...
# Show the structure of the cleaned dataset as a plot, using df_str() from lares
df_str(data, return = "plot")
- Categorical Variables
- Discrete Variables
- Continuous Variables
# Keep only the numeric columns. vapply() always yields a logical vector,
# unlike sapply(), whose return type depends on its input.
numerical_data <- data[vapply(data, is.numeric, logical(1))]
# Per-variable summary statistics: reshape to long form (name/value pairs),
# then compute each statistic per variable. across() replaces the superseded
# summarise_at(); .names = "{.fn}" keeps the output columns named
# Mean/Median/Min/Max/Sd.
numerical_data %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(
    across(
      value,
      list(Mean = mean, Median = median, Min = min, Max = max, Sd = sd),
      .names = "{.fn}"
    )
  )
## # A tibble: 9 × 6
## name Mean Median Min Max Sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 new_cases 13950. 1545 1 6.97e6 1.04e+5
## 2 new_cases_smoothed 11284. 1497. 0.143 5.88e6 8.82e+4
## 3 new_deaths 101. 14 1 1.14e4 3.30e+2
## 4 new_deaths_smoothed 90.1 13.1 0.143 4.19e3 2.83e+2
## 5 new_people_vaccinated_smoothed 87454. 7177 1 6.79e6 3.77e+5
## 6 new_vaccinations_smoothed 214364. 25939 1 2.24e7 8.55e+5
## 7 reproduction_rate 0.994 0.97 -0.03 4.22e0 2.95e-1
## 8 total_cases 4084958. 839881 129 9.94e7 9.96e+6
## 9 total_deaths 61070. 12374. 1 1.08e6 1.40e+5
# Summary-statistics table for the numeric columns, produced by st() from vtable
st(numerical_data)
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| total_cases | 43428 | 4084958 | 9958820 | 129 | 278548 | 3488102 | 99411696 |
| new_cases | 43428 | 13950 | 103615 | 1 | 406 | 6422 | 6966046 |
| new_cases_smoothed | 43428 | 11284 | 88184 | 0.14 | 410 | 5975 | 5882129 |
| total_deaths | 43428 | 61070 | 140487 | 1 | 3583 | 36870 | 1082456 |
| new_deaths | 43428 | 101 | 330 | 1 | 4 | 56 | 11447 |
| new_deaths_smoothed | 43428 | 90 | 283 | 0.14 | 3.6 | 54 | 4190 |
| reproduction_rate | 43428 | 0.99 | 0.29 | -0.03 | 0.81 | 1.1 | 4.2 |
| new_vaccinations_smoothed | 43428 | 214364 | 854845 | 1 | 5654 | 120608 | 22424286 |
| new_people_vaccinated_smoothed | 43428 | 87454 | 377164 | 1 | 1042 | 37637 | 6785334 |
# Min-max normalisation: fit caret's "range" pre-processor on the numeric
# columns, then apply the fitted transformation to the same data
preproc <- preProcess(numerical_data, method = "range")
scaled_data_minmax <- predict(preproc, newdata = numerical_data)
# Preview the first ten scaled rows
head(scaled_data_minmax, n = 10L)
## total_cases new_cases new_cases_smoothed total_deaths new_deaths
## 3159 0.0006327634 9.388398e-05 9.814237e-05 0.001138154 0.0001747335
## 3160 0.0006384167 8.053350e-05 1.009352e-04 0.001145544 0.0006115674
## 3161 0.0006421989 5.383255e-05 1.055740e-04 0.001151087 0.0004368338
## 3162 0.0006487977 9.402753e-05 1.054768e-04 0.001155706 0.0003494671
## 3163 0.0006559096 1.013488e-04 1.050397e-04 0.001159402 0.0002621003
## 3164 0.0006625487 9.460174e-05 1.041410e-04 0.001164021 0.0003494671
## 3165 0.0006689966 9.187423e-05 1.033639e-04 0.001167716 0.0002621003
## 3166 0.0006748410 8.326102e-05 1.015666e-04 0.001172335 0.0003494671
## 3167 0.0006796090 6.790080e-05 9.942948e-05 0.001178802 0.0005242006
## 3168 0.0006825463 4.177406e-05 9.738941e-05 0.001182497 0.0002621003
## new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 3159 0.001432030 0.2564706 2.809454e-06
## 3160 0.001602441 0.2588235 2.809454e-06
## 3161 0.001602441 0.2588235 2.764860e-06
## 3162 0.001397900 0.2588235 2.898643e-06
## 3163 0.001295510 0.2588235 2.720265e-06
## 3164 0.001261618 0.2588235 2.720265e-06
## 3165 0.001159228 0.2588235 2.541887e-06
## 3166 0.001227488 0.2611765 2.408104e-06
## 3167 0.001193358 0.2635294 2.229725e-06
## 3168 0.001125098 0.2658824 2.051347e-06
## new_people_vaccinated_smoothed
## 3159 9.284732e-06
## 3160 9.284732e-06
## 3161 9.137356e-06
## 3162 9.579486e-06
## 3163 8.989979e-06
## 3164 8.989979e-06
## 3165 8.400472e-06
## 3166 7.958342e-06
## 3167 7.368835e-06
## 3168 6.779328e-06
# Z-score standardisation: scale() centres and scales each column with its
# defaults; the resulting matrix is converted back to a data frame
z_score_standardized_data <- numerical_data %>%
  scale() %>%
  as.data.frame()
# Preview the first ten standardised rows
head(z_score_standardized_data, n = 10L)
## total_cases new_cases new_cases_smoothed total_deaths new_deaths
## 3159 -0.4038556 -0.1283085 -0.1214125 -0.4259262 -0.2968933
## 3160 -0.4037992 -0.1292061 -0.1212263 -0.4258692 -0.2817319
## 3161 -0.4037614 -0.1310012 -0.1209168 -0.4258265 -0.2877964
## 3162 -0.4036955 -0.1282989 -0.1209233 -0.4257909 -0.2908287
## 3163 -0.4036245 -0.1278067 -0.1209525 -0.4257624 -0.2938610
## 3164 -0.4035583 -0.1282603 -0.1210124 -0.4257269 -0.2908287
## 3165 -0.4034939 -0.1284437 -0.1210642 -0.4256984 -0.2938610
## 3166 -0.4034356 -0.1290227 -0.1211841 -0.4256628 -0.2908287
## 3167 -0.4033880 -0.1300554 -0.1213267 -0.4256130 -0.2847641
## 3168 -0.4033587 -0.1318119 -0.1214628 -0.4255845 -0.2938610
## new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 3159 -0.2971834 0.2249132 -0.2506890
## 3160 -0.2946569 0.2588510 -0.2506890
## 3161 -0.2946569 0.2588510 -0.2506902
## 3162 -0.2976894 0.2588510 -0.2506867
## 3163 -0.2992074 0.2588510 -0.2506914
## 3164 -0.2997099 0.2588510 -0.2506914
## 3165 -0.3012279 0.2588510 -0.2506960
## 3166 -0.3002159 0.2927888 -0.2506996
## 3167 -0.3007219 0.3267265 -0.2507042
## 3168 -0.3017339 0.3606643 -0.2507089
## new_people_vaccinated_smoothed
## 3159 -0.2317030
## 3160 -0.2317030
## 3161 -0.2317056
## 3162 -0.2316977
## 3163 -0.2317083
## 3164 -0.2317083
## 3165 -0.2317189
## 3166 -0.2317268
## 3167 -0.2317374
## 3168 -0.2317480
# Robust scaler: centre a vector on its median and divide by its interquartile
# range (75th percentile minus 25th percentile).
#
# Args:
#   x:     numeric vector to scale.
#   na.rm: if TRUE, missing values are ignored when computing the median and
#          the IQR (they remain NA in the result). Defaults to FALSE, in which
#          case an NA in `x` makes the quantile computation raise an error.
# Returns:
#   A numeric vector the same length as `x`. If the IQR is zero a warning is
#   issued, because the division then yields NaN or infinite values.
robust_scalar <- function(x, na.rm = FALSE) {
  spread <- IQR(x, na.rm = na.rm)
  if (!is.na(spread) && spread == 0) {
    warning("IQR is zero; scaled values will be NaN or infinite", call. = FALSE)
  }
  (x - median(x, na.rm = na.rm)) / spread
}
# Apply robust_scalar() to every numeric column. vapply() fixes the expected
# result shape (one value per row), so the outcome is always a matrix with one
# column per variable, which is then converted to a data frame.
robust_scalar_data <- as.data.frame(
  vapply(numerical_data, robust_scalar, numeric(nrow(numerical_data)))
)
# Preview the first ten scaled rows
head(robust_scalar_data, n = 10L)
## total_cases new_cases new_cases_smoothed total_deaths new_deaths
## 1 -0.2420424 -0.1479388 -0.1652539 -0.3347128 -0.2115385
## 2 -0.2418673 -0.1633976 -0.1623023 -0.3344724 -0.1153846
## 3 -0.2417501 -0.1943152 -0.1574000 -0.3342922 -0.1538462
## 4 -0.2415457 -0.1477726 -0.1575028 -0.3341420 -0.1730769
## 5 -0.2413254 -0.1392952 -0.1579647 -0.3340218 -0.1923077
## 6 -0.2411198 -0.1471077 -0.1589144 -0.3338716 -0.1730769
## 7 -0.2409201 -0.1502660 -0.1597356 -0.3337514 -0.1923077
## 8 -0.2407391 -0.1602394 -0.1616351 -0.3336012 -0.1730769
## 9 -0.2405914 -0.1780253 -0.1638936 -0.3333909 -0.1346154
## 10 -0.2405004 -0.2082779 -0.1660496 -0.3332707 -0.1923077
## new_deaths_smoothed reproduction_rate new_vaccinations_smoothed
## 1 -0.14000 0.2647059 -0.2250905
## 2 -0.12572 0.2941176 -0.2250905
## 3 -0.12572 0.2941176 -0.2250992
## 4 -0.14286 0.2941176 -0.2250731
## 5 -0.15144 0.2941176 -0.2251079
## 6 -0.15428 0.2941176 -0.2251079
## 7 -0.16286 0.2941176 -0.2251427
## 8 -0.15714 0.3235294 -0.2251688
## 9 -0.16000 0.3529412 -0.2252036
## 10 -0.16572 0.3823529 -0.2252384
## new_people_vaccinated_smoothed
## 1 -0.1943695
## 2 -0.1943695
## 3 -0.1943968
## 4 -0.1943148
## 5 -0.1944241
## 6 -0.1944241
## 7 -0.1945334
## 8 -0.1946154
## 9 -0.1947247
## 10 -0.1948340
# Basic line plot of total deaths against total cases.
# group = location draws one line per country; without it every observation of
# every country is joined into a single zigzag path. The title names the geom
# that is actually drawn (it previously read "Scatter Plot").
ggplot(data = data, aes(x = total_cases, y = total_deaths, group = location)) +
  geom_line(color = "black") +
  labs(title = "Line Plot of Total Cases vs Total Deaths")
# Convert the 'date' column to Date type if it's not already
data$date <- as.Date(data$date)
# Plot 'new deaths' and 'new deaths smoothed' against date, one colour per
# series. group = location keeps each country's series as its own line instead
# of connecting the values of different countries that share a date. The y axis
# is labelled as deaths because both series are death counts.
ggplot(data, aes(x = date, group = location)) +
  geom_line(aes(y = new_deaths, color = "New deaths")) +
  geom_line(aes(y = new_deaths_smoothed, color = "New deaths smoothed")) +
  scale_color_manual(
    values = c("New deaths" = "blue", "New deaths smoothed" = "green")
  ) +
  labs(
    title = "New Deaths and New Deaths Smoothed Over Time",
    x = "Date",
    y = "Number of Deaths",
    color = "Legend"
  ) +
  theme_minimal()
# Plot 'total cases' and 'total deaths' against date, one colour per series.
# group = location keeps each country's series as its own line instead of
# connecting the values of different countries that share a date. The y axis
# uses a neutral label because it carries both cases and deaths.
ggplot(data, aes(x = date, group = location)) +
  geom_line(aes(y = total_cases, color = "total cases")) +
  geom_line(aes(y = total_deaths, color = "total deaths")) +
  scale_color_manual(values = c("total cases" = "red", "total deaths" = "green")) +
  labs(
    title = "Total Cases and Total Deaths Over Time",
    x = "Date",
    y = "Count",
    color = "Legend"
  ) +
  theme_minimal()
# Scatter of total deaths against total cases; each point is coloured by the
# continent of its row
ggplot(data) +
  geom_point(
    mapping = aes(x = total_cases, y = total_deaths, color = factor(continent)),
    size = 0.5
  ) +
  labs(
    title = "Scatter Plot of Total Cases vs Total Deaths",
    x = "Total Cases",
    y = "Total Deaths",
    color = "Continent"
  ) +
  theme_minimal()
# Pairwise correlations of the numeric columns, rounded to one decimal place
corr <- numerical_data %>%
  cor() %>%
  round(digits = 1)
# Matching matrix of correlation p-values
p.mat <- cor_pmat(numerical_data)
# Correlogram with the variables reordered by hierarchical clustering
ggcorrplot(corr, hc.order = TRUE)
# Same correlogram, with the p-values supplied so that the non-significant
# coefficients are marked
ggcorrplot(corr, hc.order = TRUE, p.mat = p.mat)
# Unrounded correlation matrix of the numeric columns
correlation_matrix <- cor(numerical_data)
# Long form: one row per (feature, feature) pair with its correlation
correlation_data <- correlation_matrix %>%
  as.table() %>%
  as.data.frame()
names(correlation_data) <- c("Feature1", "Feature2", "Correlation")
# Tile heatmap of the correlations; x-axis labels are rotated 45 degrees
ggplot(correlation_data, aes(Feature1, Feature2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "darkblue") +
  labs(title = "Correlation Heatmap", x = "Feature 1", y = "Feature 2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total deaths per continent.
# 'total_deaths' is a running total per country and day, so summing it over all
# rows counts the same deaths once for every day. Instead, take each country's
# highest recorded total first, then add the countries up per continent.
continent_deaths <- data %>%
  group_by(continent, location) %>%
  summarise(total_deaths = max(total_deaths, na.rm = TRUE), .groups = "drop") %>%
  group_by(continent) %>%
  summarise(total_deaths = sum(total_deaths, na.rm = TRUE))
# Create a bar chart of total deaths by continent
ggplot(data = continent_deaths, aes(x = continent, y = total_deaths, fill = continent)) +
  geom_col() +
  labs(title = "Total Deaths by Continent", x = "Continent", y = "Total Deaths") +
  theme_minimal() +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Total cases per continent.
# 'total_cases' is a running total per country and day, so summing it over all
# rows counts the same cases once for every day. Instead, take each country's
# highest recorded total first, then add the countries up per continent.
continent_cases <- data %>%
  group_by(continent, location) %>%
  summarise(total_cases = max(total_cases, na.rm = TRUE), .groups = "drop") %>%
  group_by(continent) %>%
  summarise(total_cases = sum(total_cases, na.rm = TRUE))
# Create a bar chart of total cases by continent
ggplot(data = continent_cases, aes(x = continent, y = total_cases, fill = continent)) +
  geom_col() +
  labs(title = "Total Cases by Continent", x = "Continent", y = "Total Cases") +
  theme_minimal() +
  # Rotate x-axis labels for better readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Manually specify colors for the continents
continent_colors <- c(
  "Asia" = "red", "Africa" = "orange", "Europe" = "green",
  "North America" = "blue", "Oceania" = "purple", "South America" = "yellow"
)
# Line chart of 'total_cases' over time, coloured by continent with
# semi-transparent lines. group = location draws one line per country; without
# it all countries of a continent are joined into a single zigzag path.
ggplot(data, aes(x = date, y = total_cases, color = continent, group = location)) +
  geom_line(alpha = 0.5) + # Set the alpha value (transparency) to 0.5
  scale_color_manual(values = continent_colors) +
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent"
  ) +
  theme_minimal()
# Line chart of 'total_cases' over time with both colour and line type mapped
# to continent. group = location draws one line per country instead of joining
# all countries of a continent into one path. Colour and line type share the
# legend title "Continent" so that ggplot2 merges them into a single legend.
ggplot(
  data,
  aes(
    x = date, y = total_cases, color = continent, linetype = continent,
    group = location
  )
) +
  geom_line() +
  scale_color_manual(values = continent_colors) +
  scale_linetype_manual(
    values = c(
      "Asia" = "solid", "Africa" = "dashed", "Europe" = "dotted",
      "North America" = "dotdash", "Oceania" = "longdash",
      "South America" = "twodash"
    )
  ) +
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent",
    linetype = "Continent"
  ) +
  theme_minimal()
# Line chart of 'total_cases' over time, one panel per continent, each panel
# with its own y scale. group = location draws one line per country instead of
# joining all countries of a continent into one path.
ggplot(data, aes(x = date, y = total_cases, color = continent, group = location)) +
  geom_line() +
  scale_color_manual(values = continent_colors) +
  facet_wrap(~continent, scales = "free_y") +
  labs(
    title = "Total Cases Over Time by Continent",
    x = "Date",
    y = "Total Cases",
    color = "Continent"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Interactive line chart of 'total_cases' over time, coloured by continent.
# Grouping by location before plot_ly() makes plotly break the line between
# countries rather than joining all countries of a continent into one path.
# The legend title is passed as list(text = ...), the form plotly's layout
# expects for legend titles.
data %>%
  group_by(location) %>%
  plot_ly(
    x = ~date, y = ~total_cases, color = ~continent,
    type = "scatter", mode = "lines"
  ) %>%
  layout(
    title = "Total Cases Over Time by Continent",
    xaxis = list(title = "Date"),
    yaxis = list(title = "Total Cases"),
    legend = list(title = list(text = "Continent"))
  )